********************************************************************************
********************************************************************************
** Robustness analysis 5
** Weighting and multiple imputation
********************************************************************************
********************************************************************************

** Load data
use "$dataraw_path\data_ftna_publication.dta", clear


** Create peer variables for all students including non-sample student
egen peers_score_core_sd2 = std(peers_score_core) if peers_number_core>4


** Weighting:
probit sample private uncommon_name female i.year ///
	peers_score_core_sd2 peers_fail_share peers_as_share i.gpa_ftna_core_int ///
	i.district_id, ///
	cl(school_id)
margins, dydx(private uncommon_name female peers_score_core_sd2 ///
	peers_fail_share peers_as_share) atmeans

predict sample_prob, pr
gen sample_prob_inv = 1/sample_prob
replace peers_score_core_sd2 = peers_score_core_sd

eststo rob5_2: areg gpa_ftna_core_sd private uncommon_name ///
	female peers_score_core_sd2 peers_fail_share peers_as_share ///
	gpa_psle_other_sd [pweight=sample_prob_inv] ///
	if sample==1, cl(school_id) a(group_id)

drop peers_score_core_sd2


** Multiple imputation approach (takes several hours even on external server)
egen gpa_ftna_core_sd2 = std(gpa_ftna_core)
egen peers_score_core_sd2 = std(peers_score_core)

keep id_ftna id_psle gpa_ftna_core_sd2 private female uncommon_name ///
	peers_score_core_sd2 peers_fail_share peers_as_share gpa_psle_core_sd ///
	gpa_psle_other_sd school_id_psle_int school_id_int year name_length group_id 


** Generate distibution of primary schools attended within each secondary
*	school (used later for drawing primary schools for non-merged students)
preserve
	keep if school_id_int>=1 & school_id_int<700
	save "$datawork_path\temp_impute1.dta", replace
restore

preserve
	keep if school_id_int>=700 & school_id_int<1900
	save "$datawork_path\temp_impute2.dta", replace
restore

preserve
	keep if school_id_int>=1900 & school_id_int<3200
	save "$datawork_path\temp_impute3.dta", replace
restore

preserve
	keep if school_id_int>=3200 & school_id_int<4800
	save "$datawork_path\temp_impute4.dta", replace
restore

use "$datawork_path\temp_impute1.dta", clear

bys school_id_int school_id_psle_int: gen temp1 = _n if school_id_psle_int!=.
replace temp1 = . if temp1!=1
bys school_id_int: egen temp2 = sum(temp1)
sum temp2
forvalues i = 1/`r(max)' {
	gen school`i' = .
	gen fraction`i' = .
	gen fraction_cum`i' = .
}
drop temp1 temp2

by school_id_int: egen merged_sec = sum(1) if school_id_psle_int!=.
by school_id_int school_id_psle_int: egen merged_sec_pri = sum(1)
replace merged_sec_pri = . if school_id_psle_int==.
gen fraction = merged_sec_pri / merged_sec

levelsof school_id_int, local(school_ftna) // this step takes approx. 10 hours (alternative: split data up and run simultanously)
foreach name1 in `school_ftna' {
	local i = 1
	local j = 0
	di `i'
	di `name1'
	levelsof school_id_psle_int if school_id_int==`name1', local(school_psle)
	foreach name2 in `school_psle' {
		replace school`i' = `name2' if school_id_int==`name1'
		qui: levelsof fraction if school_id_int==`name1' & ///
			school_id_psle_int==`name2', local(temp2)
		replace fraction`i' = `temp2' if school_id_int==`name1'
		if `i'==1 {
			replace fraction_cum1 = fraction1 if school_id_int==`name1'
		}
		else {
			replace fraction_cum`i' = fraction_cum`j' + fraction`i' ///
				if school_id_int==`name1'
		}
		local i = `i' + 1
		local j = `j' + 1
	}
}

save "$data_path\stata_work\temp_impute1_2.dta", replace

use "$data_path\temp_impute1_2.dta", clear
append using "$data_path\stata_work\temp_impute2_2.dta"
append using "$data_path\stata_work\temp_impute3_2.dta"
append using "$data_path\stata_work\temp_impute4_2.dta"

forvalues i = 1/347 {
	drop fraction`i'
}

save "$datawork_path\tableb1_data.dta", replace // data with primary school distribution within secondary school


set matsize 11000

//mi extract 0, clear

mi set mlong

mi register imputed gpa_psle_core_sd gpa_psle_other_sd school_id_psle_int 
mi register regular gpa_ftna_core_sd2 private female peers_score_core_sd2 ///
	peers_fail_share peers_as_share name_length year uncommon_name ///
	school_id_int

mi impute chained (regress) gpa_psle_core_sd gpa_psle_other_sd = ///
	gpa_ftna_core_sd2 female peers_score_core_sd2 peers_fail_share ///
	peers_as_share uncommon_name i.year i.school_id_int, ///
	add(10) rseed(1234) force dots // impute primary school test scores (2-3 hours)

gen random = . // draw primary school for each imputation based on within secondary school distribution
gen draw_school = school_id_psle_int
forvalues i = 1/10 {
	set seed 1234
	replace random = runiform()
	forvalues j = 1/347 {
		replace draw_school = school`j' if random<fraction_cum`j' & ///
			draw_school==. & _mi_m==`i'
		local k = `j'+1
		if `j'<347 {
			replace draw_school = school`j' if draw_school==. & _mi_m==`i' & ///
				school`j'!=. & school`k'==. & random>=fraction_cum`j'
		}
	}
}
sort _mi_m school_id_int school_id_psle_int _mi_id

sum gpa_psle_core_sd // calculate new fixed effect
local min = floor(`r(min)'*10)/10
local max = ceil(`r(max)'*10)/10+0.48 // 0.48 is difference between gpa_psle_core_sd values
egen gpa_psle_core_group = cut(gpa_psle_core_sd), at(`min'(0.48)`max') icodes
egen group_id2 = group(gpa_psle_core_group year draw_school)

mi estimate, dots post: areg gpa_ftna_core_sd2 private female ///
	uncommon_name peers_score_core_sd2 peers_fail_share peers_as_share ///
	gpa_psle_other_sd, ///
	cl(school_id_int) a(group_id2) // takes 15 min
eststo rob5_3

gen r2 = . // estimate model m times and calculate average R^2
forvalues m = 1/10 {
	qui: areg gpa_ftna_core_sd2 private female uncommon_name ///
		peers_score_core_sd2 peers_fail_share peers_as_share ///
		gpa_psle_other_sd ///
		if inlist(_mi_m,0,`m'), cl(school_id_int) a(group_id2)
	replace r2 = e(r2) in `m'
}
egen r2_mean = mean(r2)
tab r2_mean
drop r2


** Output (remember to include probit marginal effects manually)
esttab rob5_2 rob5_3 using "$out_path\tableb1.tex", replace se ///
stats(N r2, fmt(%12.3gc) labels("\(N\)" "\(R^2\)")) compress nomtitles ///
starlevels("" 0.01) substitute(\_ _) b(3) ///
/*KEEP*/k(private female uncommon_name peers_score_core_sd2 ///
peers_fail_share peers_as_share gpa_psle_other_sd) ///
/*ORDER*/o(private female uncommon_name peers_score_core_sd2 ///
peers_fail_share peers_as_share gpa_psle_other_sd) ///
/*LABELS*/varl(private "\$Private_s$" female "\$Female$" ///
uncommon_name "\$\textit{Uncommon name}$" ///
peers_score_core_sd2 "\$\textit{Peers PSLE}_{s}$" ///
peers_fail_share "\$\textit{Peers failed}_{s}$" ///
peers_as_share "\$\textit{Peers with A}_{s}$" ///
gpa_psle_other_sd "\$\textit{GPA other}_{p} \textit{ (PSLE)}$")
